{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a77e4616-c43b-42d4-88bf-40eeaab5e6cf",
   "metadata": {},
   "source": [
    "# L6: Build Your Own RAG Bot"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc2f9ec6-b56b-444c-9d9f-6f0b39973450",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px\"> ⏳ <b>Note <code>(Kernel Starting)</code>:</b> This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.</p>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30697566-ee92-47f1-a8eb-c17848e9a03b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Warning control\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "916a2cac-7bc8-4cd8-ae26-3d1076e8384b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from unstructured_client import UnstructuredClient\n",
    "from unstructured_client.models import shared\n",
    "from unstructured_client.models.errors import SDKError\n",
    "\n",
    "from unstructured.chunking.title import chunk_by_title\n",
    "from unstructured.partition.md import partition_md\n",
    "from unstructured.partition.pptx import partition_pptx\n",
    "from unstructured.staging.base import dict_to_elements\n",
    "\n",
    "import chromadb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e7b106a-5144-4a33-9a85-6a2c2f9faa36",
   "metadata": {},
   "outputs": [],
   "source": [
    "from Utils import Utils\n",
    "utils = Utils()\n",
    "\n",
    "DLAI_API_KEY = utils.get_dlai_api_key()\n",
    "DLAI_API_URL = utils.get_dlai_url()\n",
    "\n",
    "s = UnstructuredClient(\n",
    "    api_key_auth=DLAI_API_KEY,\n",
    "    server_url=DLAI_API_URL,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d1693200-c7b2-4813-81c3-796eba999b3a",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px\"> 💻 &nbsp; <b>Access Utils File and Helper Functions:</b> To access helper functions and other related files for this notebook, 1) click on the <em>\"View\"</em> option on the top menu of the notebook and then 2) click on <em>\"File Browser\"</em>. For more help, please see the <em>\"Appendix - Tips and Help\"</em> Lesson.</p>\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed7e6155-d37d-4fcf-8413-c676c947ad51",
   "metadata": {},
   "source": [
    "## Example Application: Question Answering about the Donut Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ee41a37-61ac-468c-9d47-dab2e671f85b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Image\n",
    "Image(filename='images/donut_paper.png', height=400, width=400)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76538d6f-bcdb-4bd7-a006-056a86bd1f11",
   "metadata": {},
   "outputs": [],
   "source": [
    "Image(filename='images/donut_slide.png', height=400, width=400) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e626fd92-71e8-4af2-ae7b-4d22939945b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "Image(filename='images/donut_readme.png', height=600, width=600) "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76132dfe-7360-4147-a5b1-0e571917e08b",
   "metadata": {},
   "source": [
    "## Preprocess the PDF"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d647641-fab0-4316-b4f3-bfee065b1273",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px\"> ⏳ <b>Note <code>(Wait Time)</code>:</b> The following block can take a few minutes to complete.</p>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b75d977-7d80-427d-9b24-4fcf1384fc7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"example_files/donut_paper.pdf\"\n",
    "\n",
    "with open(filename, \"rb\") as f:\n",
    "    files=shared.Files(\n",
    "        content=f.read(),\n",
    "        file_name=filename,\n",
    "    )\n",
    "\n",
    "req = shared.PartitionParameters(\n",
    "    files=files,\n",
    "    strategy=\"hi_res\",\n",
    "    hi_res_model_name=\"yolox\",\n",
    "    pdf_infer_table_structure=True,\n",
    "    skip_infer_table_types=[],\n",
    ")\n",
    "\n",
    "try:\n",
    "    resp = s.general.partition(req)\n",
    "    pdf_elements = dict_to_elements(resp.elements)\n",
    "except SDKError as e:\n",
    "    print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ed8e92e-5d85-4d9e-9867-6ed8a190b9d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "pdf_elements[0].to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f50942e5-33e4-43d9-ad89-a3840a944ed8",
   "metadata": {},
   "outputs": [],
   "source": [
    "tables = [el for el in pdf_elements if el.category == \"Table\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a733c9d8-d6a5-4bad-8ba0-b1393165b6e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "table_html = tables[0].metadata.text_as_html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27481fdb-e8d1-430d-8919-46205765c044",
   "metadata": {},
   "outputs": [],
   "source": [
    "from io import StringIO \n",
    "from lxml import etree\n",
    "\n",
    "parser = etree.XMLParser(remove_blank_text=True)\n",
    "file_obj = StringIO(table_html)\n",
    "tree = etree.parse(file_obj, parser)\n",
    "print(etree.tostring(tree, pretty_print=True).decode())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73044344-ee8e-4287-9f65-b8f9d9b258b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "Image(filename='images/donut_references.png', height=400, width=400) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffd80c89-5c09-4b89-b36b-9017d334499a",
   "metadata": {},
   "outputs": [],
   "source": [
    "reference_title = [\n",
    "    el for el in pdf_elements\n",
    "    if el.text == \"References\"\n",
    "    and el.category == \"Title\"\n",
    "][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c16d2b4-2039-4f08-ac63-9a4f27d0bc1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "reference_title.to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "303b9220-6367-4acc-a301-72e90620ed6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "references_id = reference_title.id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b1b9656-0018-4936-a9c9-3beac7a7c610",
   "metadata": {},
   "outputs": [],
   "source": [
    "for element in pdf_elements:\n",
    "    if element.metadata.parent_id == references_id:\n",
    "        print(element)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7c9c2ce-433c-493f-8d01-3d257bf1eff1",
   "metadata": {},
   "outputs": [],
   "source": [
    "pdf_elements = [el for el in pdf_elements if el.metadata.parent_id != references_id]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3495de3-1ad8-4f81-b979-d67de01bd8d2",
   "metadata": {},
   "source": [
    "### Filter out headers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a06b26de-c6f6-4790-8863-00750a4db22d",
   "metadata": {},
   "outputs": [],
   "source": [
    "Image(filename='images/donut_headers.png', height=400, width=400) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29593c8d-a579-454f-872f-9001caa5613c",
   "metadata": {},
   "outputs": [],
   "source": [
    "headers = [el for el in pdf_elements if el.category == \"Header\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e2a006f-9125-4c13-a2ee-a86452b59b2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "headers[1].to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c40d66a-4be5-4051-8226-5be421118ab3",
   "metadata": {},
   "outputs": [],
   "source": [
    "pdf_elements = [el for el in pdf_elements if el.category != \"Header\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1626e1ee-7354-485b-b2ce-64d65df355e4",
   "metadata": {},
   "source": [
    "## Preprocess the PowerPoint Slide"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c1bc612-e64e-4b9a-84de-90763c1ec064",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"example_files/donut_slide.pptx\"\n",
    "pptx_elements = partition_pptx(filename=filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "527495ff-0e84-400e-b6a6-982a5288f2f5",
   "metadata": {},
   "source": [
    "## Preprocess the README"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2e35d92-78bc-46be-985e-6baba4830b12",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"example_files/donut_readme.md\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "946d2081-9070-423b-99f0-ce260e9a6491",
   "metadata": {},
   "outputs": [],
   "source": [
    "md_elements = partition_md(filename=filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5533a29-62fe-432b-bfa0-499d4029fdbf",
   "metadata": {},
   "source": [
    "## Load the Documents into the Vector DB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b739a36-8f61-46ac-861b-641f56b140f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "elements = chunk_by_title(pdf_elements + pptx_elements + md_elements)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a87e5924-6c42-47ab-8cd1-2c22caed71c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.vectorstores import Chroma\n",
    "from langchain_core.documents import Document\n",
    "from langchain_openai import OpenAIEmbeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9af0f348-0697-4bed-8f97-a0ee03475ffe",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = []\n",
    "for element in elements:\n",
    "    metadata = element.metadata.to_dict()\n",
    "    del metadata[\"languages\"]\n",
    "    metadata[\"source\"] = metadata[\"filename\"]\n",
    "    documents.append(Document(page_content=element.text, metadata=metadata))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43ce7dc4-a23f-4024-b647-1eae469efc77",
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings = OpenAIEmbeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2bc691d-18e8-4833-9009-e13f7fa13e0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorstore = Chroma.from_documents(documents, embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a775b466-16af-4bf1-94d5-c96563ce50d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "retriever = vectorstore.as_retriever(\n",
    "    search_type=\"similarity\",\n",
    "    search_kwargs={\"k\": 6}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b3a5fa6-7169-4f1f-aabd-15eec4da8f53",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.prompts.prompt import PromptTemplate\n",
    "from langchain_openai import OpenAI\n",
    "from langchain.chains import ConversationalRetrievalChain, LLMChain\n",
    "from langchain.chains.qa_with_sources import load_qa_with_sources_chain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "238ea8c4-c099-490a-877c-2050b65fd2b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "template = \"\"\"You are an AI assistant for answering questions about the Donut document understanding model.\n",
    "You are given the following extracted parts of a long document and a question. Provide a conversational answer.\n",
    "If you don't know the answer, just say \"Hmm, I'm not sure.\" Don't try to make up an answer.\n",
    "If the question is not about Donut, politely inform them that you are tuned to only answer questions about Donut.\n",
    "Question: {question}\n",
    "=========\n",
    "{context}\n",
    "=========\n",
    "Answer in Markdown:\"\"\"\n",
    "prompt = PromptTemplate(template=template, input_variables=[\"question\", \"context\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42096f06-ae93-4d5d-acb6-02b760745b9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = OpenAI(temperature=0)\n",
    "\n",
    "doc_chain = load_qa_with_sources_chain(llm, chain_type=\"map_reduce\")\n",
    "question_generator_chain = LLMChain(llm=llm, prompt=prompt)\n",
    "qa_chain = ConversationalRetrievalChain(\n",
    "    retriever=retriever,\n",
    "    question_generator=question_generator_chain,\n",
    "    combine_docs_chain=doc_chain,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9329374-1a3f-44aa-8cd0-14a6d8bcae7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "qa_chain.invoke({\n",
    "    \"question\": \"How does Donut compare to other document understanding models?\",\n",
    "    \"chat_history\": []\n",
    "})[\"answer\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2f3d9ab-9cb4-4b64-8ed6-db3467f2a548",
   "metadata": {},
   "outputs": [],
   "source": [
    "filter_retriever = vectorstore.as_retriever(\n",
    "    search_type=\"similarity\",\n",
    "    search_kwargs={\"k\": 1, \"filter\": {\"source\": \"donut_readme.md\"}}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebca3607-5cc0-46e3-9f1e-ef60955d3abd",
   "metadata": {},
   "outputs": [],
   "source": [
    "filter_chain = ConversationalRetrievalChain(\n",
    "    retriever=filter_retriever,\n",
    "    question_generator=question_generator_chain,\n",
    "    combine_docs_chain=doc_chain,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67c0ff39-291e-4cd1-9aef-b2db960a728b",
   "metadata": {},
   "outputs": [],
   "source": [
    "filter_chain.invoke({\n",
    "    \"question\": \"How do I classify documents with DONUT?\",\n",
    "    \"chat_history\": [],\n",
    "    \"filter\": filter,\n",
    "})[\"answer\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db67aabd-fa7d-4786-912b-7f571e197967",
   "metadata": {},
   "source": [
    "## Work With Your Own Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "012b94a3-d3ba-46d9-86d0-1dc989c4fd80",
   "metadata": {},
   "outputs": [],
   "source": [
    "import panel as pn\n",
    "#import param\n",
    "from Utils import upld_file\n",
    "pn.extension()\n",
    "\n",
    "upld_widget = upld_file()\n",
    "pn.Row(upld_widget.widget_file_upload)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab695a6f-b796-499a-a32f-2c9194922890",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px\"> 🖥 &nbsp; <b>Note:</b> If the file upload interface isn't functioning properly, the issue may be related to your browser version. In such a case, please ensure your browser is updated to the latest version, or try using a different browser.</p>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "640d267b-f602-4d62-abef-3bbe7803a348",
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls ./example_files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e91e5209-1445-4cf0-90b8-f6e5f97dffa2",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px\"> 💻 &nbsp; <b>Uploading Your Own File - Method 2:</b> To upload your own files, you can also 1) click on the <em>\"View\"</em> option on the top menu of the notebook and then 2) click on <em>\"File Browser\"</em>. Then 3) click on <em>\"Upload\"</em> button to upload your files. For more help, please see the <em>\"Appendix - Tips and Help\"</em> Lesson.</p>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de556e42-9311-4f6c-8a94-bae6238b74e8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a21f972-39e1-4765-a5bc-00f1feb043c2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a10d548f-3a5c-4567-937a-ec65527c1d68",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "557b7e4c-c981-4bad-98fd-d9f9d4b7df57",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49e096dd-798e-4d2d-a225-7d55bf4a43e0",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebc0b21b-25c9-474c-9377-8c658ee1eecb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5dd34966-40e7-4099-9120-2084d5ac291b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64114f95-ca0e-4de8-a335-fa8a4849da96",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
